Predict DOC observations from env data.

Train an XGBoost model to predict DOC observations from environmental data used in Wang et al., 2023.

Author

Thelma Panaïotis

# Load project helpers — presumably provides roundp(), ggmap(), ggplot_imp(),
# plot_pdp(), n_cores and the `world` basemap used below, plus the library()
# calls (readxl, tidyverse, tidymodels, ...) — TODO confirm against utils.R
source("utils.R")

DOC observation data

Read data

Read data, keep columns of interest and rename them. Keep only observations between 0 and 10 meters.

# Path to the Hansell 2022 merged DOC bottle dataset (read twice below)
doc_file <- "data/raw/0227166/4.4/data/0-data/All_Basins_Data_Merged_Hansell_2022.xlsx"

# First pass: read only the header row to recover column names, lowercased
columns <- doc_file %>%
  read_excel(n_max = 0) %>%
  colnames() %>%
  tolower()

# Second pass: read the data itself (skipping the two header rows) with the
# cleaned column names, keep and rename the columns of interest, then
# restrict to the 0-10 m pressure range.
df_doc <- read_excel(doc_file, skip = 2, col_names = columns) %>%
  select(
    bottle, bottle_flag_w,
    lon = longitude, lat = latitude, date,
    press = `ctd pressure`,
    doc, doc_flag_w
  ) %>%
  # warning expected here: some dates are missing and fail to parse
  mutate(date = ymd(date)) %>%
  filter(between(press, 0, 10))

Clean data

Remove aberrant DOC observations (flag = 9, negative values…).

# Remove aberrant DOC observations.
# NB: filter() drops rows where the condition evaluates to NA, so
# `doc > 0` also removes missing DOC values (a separate drop_na(doc) would
# be redundant), and `doc_flag_w != 9` also drops rows whose flag is NA —
# presumably intended; TODO confirm NA flags should be excluded.
df_doc <- df_doc %>%
  filter(doc_flag_w != 9) %>%
  filter(doc > 0)

We have 7036 raw observations in the 0-10 meters depth range.

Distribution in space and time

Map

# Map of DOC observation locations over a grey world basemap
ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(data = df_doc, aes(x = lon, y = lat), size = 0.5) +
  coord_quickmap(expand = 0)

Depth distribution

# Depth (pressure) distribution of the retained observations
df_doc %>% ggplot() + geom_histogram(aes(x = press), bins = 100)

Time distribution

# Temporal distribution of observations
df_doc %>% ggplot() + geom_histogram(aes(x = date), bins = 100)

# Five-number summary of sampling dates (NB: some dates are NA)
df_doc %>% pull(date) %>% summary()
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1994-03-01" "2004-06-22" "2010-03-20" "2009-01-25" "2013-08-27" "2021-08-13" 
        NA's 
        "28" 

Average by pixel across the 0-10 m layer

First, we need to round coordinates to match them with other environmental data from GLODAP, then we can compute the average across the layer.

# Snap coordinates to the 1° grid used by the env data, then average DOC
# within each pixel across the 0-10 m layer.
df_doc <- df_doc %>%
  mutate(
    # floor to 1° then shift by 0.5 because other env data are on the
    # centre of each pixel
    across(c(lon, lat), \(x) roundp(x, precision = 1, f = floor) + 0.5)
  ) %>%
  group_by(lon, lat) %>%
  summarise(
    doc_mean = mean(doc), # per-pixel mean DOC
    doc_sd = sd(doc),     # per-pixel spread
    n_obs = n()           # number of raw observations averaged
  ) %>%
  ungroup()

We now have 2961 observations. Let’s plot a map!

# Number of observations per pixel (log10 colour scale, reversed)
# NB: second argument named `var` for consistency with the ggmap() calls
# later in this file
ggmap(df_doc, var = "n_obs", type = "point") +
  ggplot2::scale_colour_viridis_c(option = "cividis", trans = "log10", direction = -1) +
  ggtitle("Number of observations")

# Mean DOC per pixel (log10 colour scale)
ggmap(df_doc, var = "doc_mean", type = "point") +
  ggplot2::scale_colour_viridis_c(option = "A", trans = "log10") +
  ggtitle("DOC mean")

# Within-pixel DOC standard deviation
ggmap(df_doc, var = "doc_sd", type = "point") +
  ggplot2::scale_colour_viridis_c() +
  ggtitle("DOC sd")

Let’s also have a look at the distribution of DOC values. If we want to predict these, they should be ~normally distributed.

# Raw distribution of pixel-mean DOC
df_doc %>% ggplot() + geom_histogram(aes(x = doc_mean))

# Same distribution with a natural-log x axis
df_doc %>% ggplot() + geom_histogram(aes(x = doc_mean)) + scale_x_continuous(trans = "log")

Log-transformation is not perfect but should do!

# Model the log of DOC so the response is ~normally distributed
df_doc <- mutate(df_doc, doc_mean_log = log(doc_mean))

Other environmental data (predictors)

Load data, and join with DOC data.

# Loads the `env` object (gridded environmental data) into the workspace —
# presumably one row per 1° pixel; verify against the 01.all_env script
load("data/01.all_env.Rdata")

# Keep the predictors of interest, drop pixels with missing env data, then
# attach DOC observations and keep only pixels where DOC was measured.
all <- env %>%
  select(
    lon, lat, temperature, phosphate, silicate,
    alkalinity, dic, npp, oxygen
  ) %>%
  drop_na() %>% # drop missing env data
  left_join(df_doc, by = join_by(lon, lat)) %>%
  drop_na(doc_mean) # drop pixels with no DOC observation

# Visual check: matched DOC pixels (points) should fall within the env
# rasters' coverage
ggmap(env, var = "phosphate", type = "raster") +
  geom_point(data = all, aes(x = lon, y = lat), size = 0.5)

ggmap(env, var = "alkalinity", type = "raster") +
  geom_point(data = all, aes(x = lon, y = lat), size = 0.5)

Our doc-prediction dataset has 2311 observations.

# Distribution of the response variable in the final matched dataset
all %>% ggplot() + geom_histogram(aes(x = doc_mean_log))

Predict DOC from env

Prepare data

Define variables roles

# Work on a short alias for simplicity
df <- all

# Names of explanatory variables; going through select() also validates
# that they all exist in df
exp_vars <- df %>%
  select(
    temperature, phosphate, silicate,
    alkalinity, dic, npp, oxygen
  ) %>%
  colnames()

# Names of metadata variables (carried along but not used as predictors)
meta_vars <- df %>%
  select(lon, lat) %>%
  colnames()

Split data

# 80/20 train-test split, stratified on the response
df_split <- initial_split(df, prop = 0.8, strata = doc_mean_log)
df_train <- training(df_split)
df_test <- testing(df_split)

# Sanity check: response distributions should match across splits
df_train %>%
  mutate(split = "train") %>%
  bind_rows(df_test %>% mutate(split = "test")) %>%
  ggplot() +
  geom_density(aes(x = doc_mean_log, colour = split))

# 10-fold cross-validation on the training set, stratified on the response
df_folds <- vfold_cv(df_train, v = 10, strata = doc_mean_log)

Train model

Define model

# Define an xgboost regression model with hyperparameters to tune.
# NB: xgboost's threading argument is `nthread`; the previous `num.threads`
# was silently ignored by xgboost (it warned that the parameter is not
# used), so fitting ran with xgboost's default thread count.
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost", nthread = n_cores)

# List the parameters flagged for tuning
extract_parameter_set_dials(xgb_spec)
Collection of 4 parameters for tuning

 identifier       type    object
      trees      trees nparam[+]
      min_n      min_n nparam[+]
 tree_depth tree_depth nparam[+]
 learn_rate learn_rate nparam[+]

Recipe & workflow

# Generate the model formula from the list of explanatory variables.
# NB: we also add metadata variables and untransformed "doc_mean" for
# technical reasons, but their roles are reassigned below so they are not
# used to fit the model.
xgb_form <- as.formula(
  paste0("doc_mean_log ~ ", paste(c("doc_mean", meta_vars, exp_vars), collapse = " + "))
)

xgb_rec <- recipe(xgb_form, data = df_train) %>%
  # These variables are retained in the data but not included in the model
  update_role(lon, lat, new_role = "coords") %>%
  update_role(doc_mean, new_role = "untransformed outcome")
summary(xgb_rec)
# A tibble: 11 × 4
   variable     type      role                  source  
   <chr>        <list>    <chr>                 <chr>   
 1 doc_mean     <chr [2]> untransformed outcome original
 2 lon          <chr [2]> coords                original
 3 lat          <chr [2]> coords                original
 4 temperature  <chr [2]> predictor             original
 5 phosphate    <chr [2]> predictor             original
 6 silicate     <chr [2]> predictor             original
 7 alkalinity   <chr [2]> predictor             original
 8 dic          <chr [2]> predictor             original
 9 npp          <chr [2]> predictor             original
10 oxygen       <chr [2]> predictor             original
11 doc_mean_log <chr [2]> outcome               original
# Bundle the recipe and the model specification into one workflow
xgb_wflow <- workflow() %>%
  add_model(xgb_spec) %>%
  add_recipe(xgb_rec)

Gridsearch

# Latin hypercube sample of 30 hyperparameter combinations over the default
# dials ranges. NB: sampling is random and not seeded here, so the grid
# differs between runs.
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  # candidate extra tuning dimensions, currently disabled:
  #loss_reduction(),
  #sample_size = sample_prop(),
  #finalize(mtry(), df_train),
  size = 30
)

# Register a parallel backend so tune_grid() evaluates resamples in
# parallel — uses doParallel's default worker count; TODO confirm it
# matches n_cores
doParallel::registerDoParallel()
# Evaluate every grid candidate on each of the 10 CV folds, keeping the
# out-of-fold predictions for later inspection
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
# Performance profile against each hyperparameter
autoplot(xgb_res)

# Top candidates ranked by cross-validated RMSE
xgb_res %>% show_best(metric = "rmse")
# A tibble: 5 × 10
  trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
  <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
1   501    19         13    0.0495  rmse    standard   0.121    10 0.00529
2  1257     6          7    0.00479 rmse    standard   0.122    10 0.00513
3  1963    21         14    0.0831  rmse    standard   0.122    10 0.00529
4  1451    37          2    0.0234  rmse    standard   0.127    10 0.00408
5  1687    33          8    0.00119 rmse    standard   0.527    10 0.00486
# ℹ 1 more variable: .config <chr>
# Keep the single best hyperparameter combination (lowest RMSE)
best_xgb <- xgb_res %>% select_best(metric = "rmse")

Final fit

# Inject the best hyperparameters into the workflow, then refit on the
# full training set
final_xgb <- xgb_wflow %>% finalize_workflow(best_xgb)

final_res <- final_xgb %>% fit(df_train)
[12:30:12] WARNING: src/learner.cc:767: 
Parameters: { "num_threads" } are not used.

Evaluate model

Predictions & metrics

# Predict on the test set, keeping both log-space and natural-space values
test_preds <- final_res %>%
  augment(new_data = df_test) %>%
  mutate(split = "test", .before = lon) %>% # tag the split
  rename(.pred_logged = .pred) %>% # the model predicts log(DOC)
  mutate(.pred = exp(.pred_logged)) # back-transform to natural space

# Test-set RMSE in log space
test_preds %>% rmse(truth = doc_mean_log, estimate = .pred_logged)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.133
# Test-set R² in log space
test_preds %>% rsq(truth = doc_mean_log, estimate = .pred_logged)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.852

Pred VS truth

# Predicted vs observed in log space; red line = perfect prediction
# (geom_abline does not inherit the default aes, so the mapping only
# affects the points)
ggplot(test_preds, aes(x = doc_mean_log, y = .pred_logged)) +
  geom_point() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth in log-transformed space")

# Predicted vs observed on the natural scale
ggplot(test_preds, aes(x = doc_mean, y = .pred)) +
  geom_point() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth")

# Same two panels, coloured by absolute latitude to reveal latitudinal bias
ggplot(test_preds, aes(x = doc_mean_log, y = .pred_logged, colour = abs(lat))) +
  geom_point() +
  scale_colour_viridis_c() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth in log-transformed space")

ggplot(test_preds, aes(x = doc_mean, y = .pred, colour = abs(lat))) +
  geom_point() +
  scale_colour_viridis_c() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth")

# Observed DOC against alkalinity
ggplot(test_preds, aes(x = alkalinity, y = doc_mean, colour = abs(lat))) +
  geom_point() +
  scale_colour_viridis_c() +
  labs(title = "Truth VS alkalinity")

# Predicted DOC against alkalinity
# NOTE(review): this panel uses .pred_logged while the previous one uses
# doc_mean on the natural scale — confirm the scale mismatch is intended
ggplot(test_preds, aes(x = alkalinity, y = .pred_logged, colour = abs(lat))) +
  geom_point() +
  scale_colour_viridis_c() +
  labs(title = "Pred VS alkalinity")

Interpret model

Explainer

Start by generating an explainer.

# Keep only the predictor columns, processed exactly as the model saw them
vip_train <- bake(prep(xgb_rec), new_data = NULL, all_predictors())

# Build a DALEX explainer around the fitted parsnip model
xgb_explain <- explain_tidymodels(
  model = extract_fit_parsnip(final_res),
  data = vip_train,
  y = pull(df_train, doc_mean_log)
)
Preparation of a new explainer is initiated
  -> model label       :  model_fit  (  default  )
  -> data              :  1847  rows  7  cols 
  -> data              :  tibble converted into a data.frame 
  -> target variable   :  1847  values 
  -> predict function  :  yhat.model_fit  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package parsnip , ver. 1.1.1 , task regression (  default  ) 
  -> predicted values  :  numerical, min =  3.64683 , mean =  4.234957 , max =  6.234434  
  -> residual function :  difference between y and yhat (  default  )
  -> residuals         :  numerical, min =  -0.2887175 , mean =  3.348189e-06 , max =  0.3489649  
  A new explainer has been created!  

Variable importance

# Permutation-based variable importance
xgb_var_imp <- model_parts(xgb_explain)
ggplot_imp(xgb_var_imp)

# Number of most important variables to keep for partial dependence plots.
# Must be defined BEFORE the slice_head(n = n_pdp) call below — it was
# previously first assigned only in the next chunk, after this use.
n_pdp <- 3

# Extract the n_pdp most important explanatory variables: keep the largest
# dropout loss per variable across permutations, then rank variables by it
vars_pdp <- xgb_var_imp %>%
  as_tibble() %>%
  arrange(desc(dropout_loss)) %>%
  filter(variable %in% exp_vars) %>%
  group_by(variable) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  arrange(desc(dropout_loss)) %>%
  slice_head(n = n_pdp) %>%
  pull(variable)

Partial dependence plots

# Number of partial dependence plots to draw, one per top-ranked variable
# (the old inline comment said "only one pdp" but the value is 3)
n_pdp <- 3
# Build one PDP per selected variable; seq_len() is safe even if n_pdp
# were 0 (unlike 1:n_pdp), and lapply avoids growing a list in a loop
plots <- lapply(vars_pdp[seq_len(n_pdp)], \(v) plot_pdp(xgb_explain, v))
plots
[[1]]


[[2]]


[[3]]

Conclusion

Environmental data can predict 85% of DOC variance in the surface layer.